Análise exploratória de dados do IMDB sobre seriados de TV e Streaming. Os dados originais e as variáveis vêm deste repositório. Lá consta a explicação de como os dados foram gerados e do significado de cada variável.
# Load the IMDB per-episode dataset. Every column is parsed as double unless
# listed explicitly; the four columns named below are kept as text.
episodes <- read_csv(
  here("data/series_from_imdb.csv"),
  col_types = cols(
    series_name = col_character(),
    episode = col_character(),
    season = col_character(),
    url = col_character(),
    .default = col_double()
  ),
  progress = FALSE
)
# Print a compact overview of the table: one line per column showing its
# type and leading values.
glimpse(episodes)
Observations: 32,070
Variables: 18
$ series_name <chr> "13 Reasons Why", "13 Reasons Why", "13 Reasons Why", "13 Reasons Why", "13 Reasons Why", "13 ...
$ episode <chr> "Tape 1, Side A", "Tape 1, Side B", "Tape 2, Side A", "Tape 2, Side B", "Tape 3, Side A", "Tap...
$ series_ep <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
$ season <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",...
$ season_ep <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
$ url <chr> "http://www.imdb.com/title/tt5174246/", "http://www.imdb.com/title/tt5174248/", "http://www.im...
$ user_rating <dbl> 8.5, 8.2, 8.1, 8.3, 8.5, 8.3, 8.6, 8.4, 8.9, 8.8, 9.3, 9.2, 9.4, 8.6, 8.3, 8.2, 8.7, 8.4, 8.6,...
$ user_votes <dbl> 3661, 3009, 2784, 2658, 2617, 2491, 2548, 2436, 2507, 2490, 3403, 2898, 4053, 2174, 1603, 1471...
$ r1 <dbl> 0.04143948, 0.04176334, 0.04446038, 0.05065666, 0.05718643, 0.05128205, 0.05951449, 0.06022122...
$ r2 <dbl> 0.003816794, 0.003646006, 0.003226963, 0.002251407, 0.002668700, 0.004407051, 0.004306969, 0.0...
$ r3 <dbl> 0.0032715376, 0.0046403712, 0.0046611689, 0.0030018762, 0.0022874571, 0.0020032051, 0.00313234...
$ r4 <dbl> 0.004634678, 0.006297647, 0.008246683, 0.005253283, 0.006099886, 0.010016026, 0.007830854, 0.0...
$ r5 <dbl> 0.011177754, 0.013258204, 0.019361778, 0.016510319, 0.013343500, 0.014823718, 0.013312451, 0.0...
$ r6 <dbl> 0.031079607, 0.036460060, 0.043743277, 0.038273921, 0.033930614, 0.045673077, 0.028582616, 0.0...
$ r7 <dbl> 0.09133043, 0.13059330, 0.13302259, 0.11031895, 0.09264201, 0.10576923, 0.08026625, 0.09954937...
$ r8 <dbl> 0.20692475, 0.27842227, 0.28002868, 0.25628518, 0.20243995, 0.26322115, 0.16679718, 0.21630479...
$ r9 <dbl> 0.2764449, 0.2031820, 0.1724632, 0.2112570, 0.2436142, 0.1875000, 0.2411903, 0.2105694, 0.2751...
$ r10 <dbl> 0.3298800, 0.2817368, 0.2907852, 0.3061914, 0.3457873, 0.3153045, 0.3950666, 0.3478083, 0.4432...
# Attach the columns of `sumario_simples` to each episode (matched on series
# and season), then flag episodes whose position within the season lies
# strictly between p20 and p80.
# NOTE(review): p20/p80 are presumably columns brought in by the join --
# confirm against the chunk that builds `sumario_simples`.
episodes <- episodes %>%
  left_join(sumario_simples, by = c("series_name", "season")) %>%
  group_by(series_name, season) %>%
  mutate(middle_eps = season_ep > p20 & season_ep < p80) %>%
  ungroup()

# Display the augmented table.
episodes
# Jittered scatter of user ratings per series, one panel per season, with
# points colored by the `middle_eps` flag. Axes are flipped so series names
# run along the vertical axis.
p <- ggplot(
  episodes,
  aes(x = series_name, y = user_rating, color = middle_eps)
) +
  geom_jitter(width = 0.3, alpha = 0.7) +
  coord_flip() +
  facet_wrap(~ season)

# Render the ggplot object through plotly.
ggplotly(p)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`